{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SQLContext\n",
    "sc = SparkContext()\n",
    "sqlContext = SQLContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.functions import udf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "l = [('Alice', 16),('john', 82),('george', 13),('mark', 24)]\n",
    "df = sqlContext.createDataFrame(l,['name', 'age'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+---+\n",
      "|  name|age|\n",
      "+------+---+\n",
      "| Alice| 16|\n",
      "|  john| 82|\n",
      "|george| 13|\n",
      "|  mark| 24|\n",
      "+------+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "maturity_udf = udf(lambda age: \"adult\" if age >=18 else \"child\", StringType())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df1 = df.withColumn(\"maturity\", maturity_udf(df.age))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- name: string (nullable = true)\n",
      " |-- age: long (nullable = true)\n",
      " |-- maturity: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df1.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+---+--------+\n",
      "|  name|age|maturity|\n",
      "+------+---+--------+\n",
      "| Alice| 16|   child|\n",
      "|  john| 82|   adult|\n",
      "|george| 13|   child|\n",
      "|  mark| 24|   adult|\n",
      "+------+---+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df1.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def condition(r):\n",
    "    if (r < 1):\n",
    "        r = \"infant\" \n",
    "    elif(1<=r <=18):\n",
    "        r = \"child\"\n",
    "    else: \n",
    "        r = \"adult\" \n",
    "    return r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#maturity_udf = udf(lambda age: \"adult\" if age >=18 else \"child\", StringType())\n",
    "maturity_udf = udf(lambda x: condition(x), StringType())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df2 = df.withColumn(\"maturity\", maturity_udf(df.age))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- name: string (nullable = true)\n",
      " |-- age: long (nullable = true)\n",
      " |-- maturity: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df2.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+---+--------+\n",
      "|  name|age|maturity|\n",
      "+------+---+--------+\n",
      "| Alice| 16|   child|\n",
      "|  john| 82|   adult|\n",
      "|george| 13|   child|\n",
      "|  mark| 24|   adult|\n",
      "+------+---+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df2.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
